This project demonstrates a complete data engineering and data science workflow: - Data Engineering: Automated ETL pipeline extracting data from NHTSA FARS API - Database Management: SQLite database with optimized queries - Feature Engineering: 25+ derived features for predictive modeling - Machine Learning: Multiple algorithms trained and evaluated - Deployment: Production-ready model with API capabilities
The data pipeline consists of three main stages implemented in modular R scripts:
Extract (01_extract_data.R): - Connects
to NHTSA FARS API - Retrieves Minnesota crash data (State Code: 27) -
Handles multiple datasets: Accident, Person, Vehicle
Transform (02_transform_data.R): - Data
quality validation - Feature engineering (temporal, geographic,
behavioral) - Minnesota-specific features (winter conditions, metro
areas)
Load (03_load_data.R): - Writes to
SQLite database - Creates indexed tables for performance - Generates
model-ready views
# Connect to the database created by ETL pipeline
con <- dbConnect(SQLite(), "data/minnesota_fars.db")
# Get summary statistics
total_crashes <- dbGetQuery(con, "SELECT COUNT(*) as count FROM fatal_crashes")$count
total_fatalities <- dbGetQuery(con, "SELECT SUM(total_fatalities) as total FROM fatal_crashes")$total
year_range <- dbGetQuery(con, "SELECT MIN(crash_year) as min_year, MAX(crash_year) as max_year FROM fatal_crashes")
# Display overview
data.frame(
Metric = c("Total Fatal Crashes", "Total Fatalities", "Year Range", "Average Fatalities per Crash"),
Value = c(
format(total_crashes, big.mark = ","),
format(total_fatalities, big.mark = ","),
paste(year_range$min_year, "-", year_range$max_year),
round(total_fatalities / total_crashes, 2)
)
) %>% kable(caption = "Minnesota FARS Database Summary") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Metric | Value |
|---|---|
| Total Fatal Crashes | 450 |
| Total Fatalities | 655 |
| Year Range | 2015 - 2023 |
| Average Fatalities per Crash | 1.46 |
# Query data from database
crash_data <- dbGetQuery(con, "SELECT * FROM fatal_crashes")
cat("Loaded", nrow(crash_data), "crash records\n")## Loaded 450 crash records
## Features available: 31
# Display sample
head(crash_data, 10) %>%
select(case_id, crash_year, crash_datetime, county, total_fatalities,
winter_crash, alcohol_involved, high_severity) %>%
kable(caption = "Sample Crash Records") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| case_id | crash_year | crash_datetime | county | total_fatalities | winter_crash | alcohol_involved | high_severity |
|---|---|---|---|---|---|---|---|
| 201500001 | 2015 | 1442463960 | Anoka | 3 | 0 | 0 | 1 |
| 201500002 | 2015 | 1447562280 | Carver | 2 | 1 | 0 | 0 |
| 201500003 | 2015 | 1421940540 | Hennepin | 1 | 1 | 0 | 0 |
| 201500004 | 2015 | 1448673120 | Hennepin | 1 | 1 | 0 | 0 |
| 201500005 | 2015 | 1429108980 | Carver | 1 | 0 | 0 | 0 |
| 201500006 | 2015 | 1425263880 | Hennepin | 2 | 1 | 0 | 0 |
| 201500007 | 2015 | 1423216260 | Carver | 1 | 1 | 0 | 0 |
| 201500008 | 2015 | 1423740480 | Ramsey | 1 | 1 | 0 | 0 |
| 201500009 | 2015 | 1438107120 | Dakota | 1 | 0 | 1 | 0 |
| 201500010 | 2015 | 1422483900 | Carver | 1 | 1 | 0 | 0 |
yearly_stats <- crash_data %>%
group_by(crash_year) %>%
summarize(
crashes = n(),
fatalities = sum(total_fatalities),
winter_crashes = sum(winter_crash, na.rm = TRUE),
alcohol_crashes = sum(alcohol_involved, na.rm = TRUE)
)
p1 <- plot_ly(yearly_stats, x = ~crash_year) %>%
add_bars(y = ~crashes, name = "Total Crashes", marker = list(color = '#1f77b4')) %>%
add_lines(y = ~fatalities, name = "Fatalities", yaxis = "y2",
line = list(color = '#ff7f0e', width = 3)) %>%
layout(
title = "Minnesota Fatal Crashes: Yearly Trends",
xaxis = list(title = "Year"),
yaxis = list(title = "Number of Crashes"),
yaxis2 = list(title = "Total Fatalities", overlaying = "y", side = "right"),
hovermode = "x unified"
)
p1seasonal_stats <- crash_data %>%
group_by(season) %>%
summarize(
crashes = n(),
fatalities = sum(total_fatalities),
avg_severity = mean(total_fatalities)
) %>%
mutate(season = factor(season, levels = c("Winter", "Spring", "Summer", "Fall")))
p2 <- plot_ly(seasonal_stats, x = ~season, y = ~crashes, type = 'bar',
marker = list(color = c('#3498db', '#2ecc71', '#f39c12', '#e67e22')),
text = ~paste("Crashes:", crashes, "<br>Fatalities:", fatalities),
hoverinfo = 'text') %>%
layout(
title = "Fatal Crashes by Season",
xaxis = list(title = ""),
yaxis = list(title = "Number of Crashes")
)
p2factors <- data.frame(
Factor = c("Alcohol Involved", "Speed Related", "Dark Conditions",
"Icy Roads", "Adverse Weather", "Work Zone", "Pedestrian Involved"),
Count = c(
sum(crash_data$alcohol_involved, na.rm = TRUE),
sum(crash_data$speed_related, na.rm = TRUE),
sum(crash_data$dark_conditions, na.rm = TRUE),
sum(crash_data$icy_road, na.rm = TRUE),
sum(crash_data$adverse_weather, na.rm = TRUE),
sum(crash_data$work_zone, na.rm = TRUE),
sum(crash_data$pedestrian_involved, na.rm = TRUE)
)
) %>%
mutate(Percentage = round(Count / nrow(crash_data) * 100, 1)) %>%
arrange(desc(Count))
p3 <- plot_ly(factors, x = ~Count, y = ~reorder(Factor, Count),
type = 'bar', orientation = 'h',
marker = list(color = '#e74c3c'),
text = ~paste0(Count, " (", Percentage, "%)"),
textposition = 'outside') %>%
layout(
title = "Contributing Factors in Fatal Crashes",
xaxis = list(title = "Number of Crashes"),
yaxis = list(title = ""),
margin = list(l = 150)
)
p3# Get crashes with valid coordinates
map_data <- crash_data %>%
filter(!is.na(latitude), !is.na(longitude)) %>%
filter(latitude > 43, latitude < 50, longitude > -97, longitude < -89)
# Color by severity
pal <- colorFactor(
palette = c("#FFA500", "#DC143C"),
domain = c(FALSE, TRUE)
)
leaflet(map_data) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircleMarkers(
lng = ~longitude,
lat = ~latitude,
radius = ~ifelse(high_severity, 6, 3),
color = ~pal(high_severity),
fillOpacity = 0.7,
stroke = TRUE,
weight = 1,
popup = ~paste0(
"<b>Date:</b> ", crash_datetime, "<br>",
"<b>County:</b> ", county, "<br>",
"<b>Fatalities:</b> ", total_fatalities, "<br>",
"<b>Season:</b> ", season, "<br>",
"<b>Weather:</b> ", ifelse(adverse_weather, "Adverse", "Clear"), "<br>",
"<b>Alcohol:</b> ", ifelse(alcohol_involved, "Yes", "No")
)
) %>%
addLegend(
position = "bottomright",
pal = pal,
values = c(FALSE, TRUE),
labels = c("Low Severity (1-2 deaths)", "High Severity (3+ deaths)"),
title = "Crash Severity"
) %>%
setView(lng = -94.5, lat = 46.5, zoom = 7)Geographic Insights: - Mapped crashes: r format(nrow(map_data), big.mark=“,”) - High-severity crashes concentrated in metro areas and major highway corridors - Rural areas show different crash patterns
The modeling pipeline was executed using scripts 04-07:
# Modeling Pipeline (already executed)
source("src/run_modeling_pipeline.R")
# Run complete modeling
results <- run_modeling_pipeline()Training Configuration:
# Load evaluation results from modeling pipeline
comparison <- read.csv("output/model_comparison.csv")
comparison %>%
arrange(desc(AUC)) %>%
kable(digits = 4, caption = "Model Performance Comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| Model | Accuracy | Sensitivity | Specificity | Precision | F1_Score | AUC |
|---|---|---|---|---|---|---|
| logistic | 1.0000 | 1.0000 | 1 | 1 | 1.0 | 1 |
| random_forest | 0.9663 | 0.6667 | 1 | 1 | 0.8 | 1 |
| decision_tree | 1.0000 | 1.0000 | 1 | 1 | 1.0 | 1 |
Best Performing Model: r comparison$Model[1]
# Try to load feature importance plot
if (file.exists("output/evaluation/feature_importance_rf.png")) {
knitr::include_graphics("output/evaluation/feature_importance_rf.png")
} else {
cat("Feature importance visualization not available.")
}Top Predictive Features:
# Calculate key statistics for insights
winter_impact <- crash_data %>%
group_by(winter_crash) %>%
summarize(
crashes = n(),
avg_fatalities = mean(total_fatalities),
high_severity_rate = mean(high_severity, na.rm = TRUE)
)
# FIXED: Calculate winter increase properly
if (nrow(winter_impact) == 2) {
winter_increase <- round(
(winter_impact$high_severity_rate[winter_impact$winter_crash == TRUE] /
winter_impact$high_severity_rate[winter_impact$winter_crash == FALSE] - 1) * 100, 1
)
} else {
winter_increase <- 0
}
alcohol_pct <- round(sum(crash_data$alcohol_involved, na.rm = TRUE) / nrow(crash_data) * 100, 1)
rural_pct <- round(sum(crash_data$rural_crash, na.rm = TRUE) / nrow(crash_data) * 100, 1)
metro_crashes <- sum(crash_data$metro_area, na.rm = TRUE)Finding 1: Winter Weather Impact
Finding 2: Alcohol Involvement
Finding 3: Rural vs Urban Disparities
For Minnesota Department of Transportation:
For Law Enforcement:
For Vision Zero Initiative: